cont <- params$controlCellLine
exp <- params$experimentalCellLine
countmatrix.all <- params$countmatrix.all
metadata.all <- params$metadata.all
rm(params) # Remove the parameters so that we can make subsequent parameterized calls
metadata.pair <- as.data.frame(metadata.all) %>%
filter(CellLine == cont | CellLine == exp)
as.data.frame(metadata.pair)
countmatrix.pair <- countmatrix.all[, metadata.pair$ShortName]
as.data.frame(countmatrix.pair)
Run Deseq on the data set.
# Saving time by just loading the dds we already ran (recent changes are all after this point)
load(str_interp("Rdata/${exp}_vs_${cont}_dds.RData"))
res <- results(dds.pair, contrast = c("CellLine", exp, cont), alpha = 0.05)
res <- res[order(res$log2FoldChange), ]
outFile <- str_interp("output/${exp}_vs_${cont}_deseq_results.csv")
write.csv(as.data.frame(res), file = outFile)
Filter res for padj < 0.05
res.filtered <- as.data.frame(res) %>%
filter(padj < 0.05)
# filter(log2FoldChange >= 1.5 | log2FoldChange <= -1.5)
res.filtered <- res.filtered[order(res.filtered$log2FoldChange, decreasing = TRUE),]
res.filtered
up.unfiltered <- subset(res, log2FoldChange > 0)
up.unfiltered <- up.unfiltered[order(up.unfiltered$log2FoldChange, decreasing = TRUE), ]
outFile <- str_interp("output/${exp}_vs_${cont}_all_upregulated_genes.csv")
write.csv(up.unfiltered[, c("log2FoldChange", "padj")], file = outFile)
up.unfiltered[, c("log2FoldChange", "padj")]
log2 fold change (MLE): CellLine PEO4 vs PEO1
DataFrame with 10555 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
RNU1-1 21.6996 2.86087e-07
CD302 10.9316 2.64992e-18
LOC102724560 10.6306 2.81208e-17
LRRTM1 10.5750 4.86434e-17
C3orf14 10.0030 1.34060e-15
... ... ...
TMEM106B 4.69566e-04 0.998649
HIBADH 2.98925e-04 0.999160
NDUFA2 2.14142e-04 0.999470
LOC102724828 2.12234e-04 0.999402
PSMD10 1.61526e-05 0.999970
up <- subset(res.filtered, log2FoldChange > 0)
up <- up[order(up$log2FoldChange, decreasing = TRUE), ]
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_genes.csv")
write.csv(up[, c("log2FoldChange", "padj")], file = outFile)
print(up[, c("log2FoldChange", "padj")])
down.unfiltered <- subset(res, log2FoldChange < 0)
down.unfiltered <- down.unfiltered[order(down.unfiltered$log2FoldChange, decreasing = FALSE), ]
outFile <- str_interp("output/${exp}_vs_${cont}_all_downregulated_genes.csv")
write.csv(down.unfiltered[, c("log2FoldChange", "padj")], file = outFile)
print(down.unfiltered[, c("log2FoldChange", "padj")])
log2 fold change (MLE): CellLine PEO4 vs PEO1
DataFrame with 10507 rows and 2 columns
log2FoldChange padj
<numeric> <numeric>
LOC102724441 -25.1078 1.82153e-09
HOXC10 -12.5672 2.32202e-24
EPB41L3 -12.0057 3.89779e-22
CDH4 -11.1862 3.98239e-19
PROM1 -11.0903 3.53500e-207
... ... ...
CRIP3 -6.92803e-04 0.999937
IQSEC3 -3.53737e-04 0.999125
CD68 -2.02510e-04 0.999402
ABI1 -2.72415e-05 0.999937
WDR65 -1.60112e-05 0.999975
down <- subset(res.filtered, log2FoldChange < 0)
down <- down[order(down$log2FoldChange, decreasing = TRUE), ]
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_genes.csv")
write.csv(down[, c("log2FoldChange", "padj")], file = outFile)
print(down[, c("log2FoldChange", "padj")])
as.data.frame(res) %>%
ggplot(aes(x = log2FoldChange, y = -log10(padj), label = rownames(res))) +
geom_point() +
theme_minimal() +
scale_color_manual(values = c("black", "blue", "red")) +
geom_text_repel() +
geom_hline(yintercept = 1.301) +
geom_vline(xintercept = 1.2) +
geom_vline(xintercept = -1.2) +
xlim(-10, 10)
Warning: Removed 8696 rows containing missing values (`geom_point()`).
Warning: Removed 8696 rows containing missing values (`geom_text_repel()`).
Warning: ggrepel: 21025 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
Perform gene set enrichment analysis using Cluster Profiler. This gives us GO pathways that are significantly regulated based on the log2fold change of expression of individual genes.
Using a pvalue Cutoff of 0.05
gene_list <- res$log2FoldChange
names(gene_list) <- rownames(res)
gene_list <- sort(gene_list, decreasing = TRUE)
# Set the seed so our results are reproducible:
set.seed(2023)
gsea_res <- gseGO(gene_list, ont = "BP", OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", seed = TRUE, pvalueCutoff = 0.05)
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (13.19% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
# Format output
gsea_res_df <- as.data.frame(gsea_res)
gsea_res_df <- gsea_res_df %>%
mutate(original_row_num = row_number())
gsea_res_df <- gsea_res_df[order(gsea_res_df$NES, decreasing = TRUE),]
row.names(gsea_res_df) <- gsea_res_df$ID
NES is the normalized enrichment score.
gsea_res_df_short <- gsea_res_df[c("pvalue", "p.adjust", "NES", "Description")]
gsea_res_df_short$"core_enrichment_genes" <- gsea_res_df$core_enrichment
gsea_res_df_short.up <- subset(gsea_res_df_short, gsea_res_df_short$NES >= 0)
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_upregulated_pathways.csv")
write.csv(gsea_res_df_short.up, file = outFile)
gsea_res_df_short.up
GSEA plot of the five most upregulated pathways (or least downregulated)
maxIndex <- min(5, nrow(gsea_res_df)) # Prevents us from trying to access out of bounds if there are not five pathways
top5PathwaysIds = gsea_res_df[1:maxIndex, "original_row_num"]
gseaplot2(gsea_res, geneSetID = top5PathwaysIds, pvalue_table = FALSE, ES_geom = "dot")
Volcano Plot (Average NES & adjusted p value)
as.data.frame(gsea_res_df_short.up) %>%
ggplot(aes(x = NES, y = -log10(p.adjust), label = rownames(gsea_res_df_short.up))) +
geom_point() +
theme_minimal() +
scale_color_manual(values = c("black", "blue", "red")) +
geom_text_repel() +
geom_hline(yintercept = 1.301) +
geom_vline(xintercept = 1.2) +
geom_vline(xintercept = -1.2) +
xlim(-10, 10)
Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gsea_res_df_short.down <- subset(gsea_res_df_short, gsea_res_df_short$NES <= 0)
outFile <- str_interp("output/${exp}_vs_${cont}_significantly_downregulated_pathways.csv")
write.csv(gsea_res_df_short.down, file = outFile)
gsea_res_df_short.down
GSEA plot of the five most downregulated pathways (or least upregulated)
minIndex <- max(1, nrow(gsea_res_df) - 5) # Prevents us from trying to access out of bounds if there are not five downregulated pathways
bottom5PathwaysIds = gsea_res_df[minIndex:nrow(gsea_res_df), "original_row_num"]
gseaplot2(gsea_res, geneSetID = bottom5PathwaysIds, pvalue_table = FALSE, ES_geom = "dot")
Volcano plot (Average NES & adjusted p value)
as.data.frame(gsea_res_df_short.down) %>%
ggplot(aes(x = NES, y = -log10(p.adjust), label = rownames(gsea_res_df_short.down))) +
geom_point() +
theme_minimal() +
scale_color_manual(values = c("black", "blue", "red")) +
geom_text_repel() +
geom_hline(yintercept = 1.301) +
geom_vline(xintercept = 1.2) +
geom_vline(xintercept = -1.2) +
xlim(-10, 10)
Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider increasing
max.overlaps
Use Revigo to cluster upregulated pathways
revigo_input.cellline.up <- gsea_res_df_short.up[c("p.adjust")]
rownames(revigo_input.cellline.up) <- rownames(gsea_res_df_short.up)
simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.up),
orgdb = "org.Hs.eg.db",
ont = "BP",
method = "Rel"
)
preparing gene to GO mapping data...
preparing IC data...
Warning in calculateSimMatrix(rownames(revigo_input.cellline.up), orgdb = "org.Hs.eg.db", :
Removed 1 terms that were not found in orgdb for BP
scores <- setNames(-log10(revigo_input.cellline.up$p.adjust), rownames(revigo_input.cellline.up))
if (nrow(revigo_input.cellline.up) > 1) {
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold = 0.7,
orgdb = "org.Hs.eg.db"
)
} else {
reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
print("There will be no graphs appearing below this because there were not enough significantly upregulated pathways to meaningfully cluster them")
}
Revigo interactive scatter plot. Distances represent the similarity between terms, axes are the first 2 components of a PCA plot, Each bubble indicates the representative (chosen mostly by p-value) from a cluster of terms. Size of the bubble indicates the generality of the term (large meaning a more general term).
if (nrow(reducedTerms) > 2) {
revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms clustered
if (nrow(reducedTerms) > 2) {
heatmapPlot(simMatrix,
reducedTerms,
annotateParent = TRUE,
annotationLabel = "parentTerm",
fontsize = 6
)
}
This is the same content, but interactive.
if (nrow(reducedTerms) > 2) {
revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
if (nrow(reducedTerms) > 2) {
treemapPlot(reducedTerms)
}
Use Revigo to cluster downregulated pathways
revigo_input.cellline.down <- gsea_res_df_short.down[c("p.adjust")]
rownames(revigo_input.cellline.down) <- rownames(gsea_res_df_short.down)
simMatrix <- calculateSimMatrix(rownames(revigo_input.cellline.down),
orgdb = "org.Hs.eg.db",
ont = "BP",
method = "Rel"
)
preparing gene to GO mapping data...
preparing IC data...
scores <- setNames(-log10(revigo_input.cellline.down$p.adjust), rownames(revigo_input.cellline.down))
if (nrow(revigo_input.cellline.down) > 1) {
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold = 0.7,
orgdb = "org.Hs.eg.db"
)
} else {
reducedTerms <- data.frame(matrix(ncol = 0, nrow = 0))
print("There will be no graphs appearing below this because there were not enough significantly downregulated pathways to meaningfully cluster them")
}
Revigo interactive scatter plot. Distances represent the similarity between terms, axes are the first 2 components of a PCA plot, Each bubble indicates the representative (chosen mostly by p-value) from a cluster of terms. Size of the bubble indicates the generality of the term (large meaning a more general term).
if (nrow(reducedTerms) > 2) {
revigo_scatterplot(simMatrix, reducedTerms)
}
Revigo heatmap plot. Similar terms clustered
if (nrow(reducedTerms) > 2) {
heatmapPlot(simMatrix,
reducedTerms,
annotateParent = TRUE,
annotationLabel = "parentTerm",
fontsize = 6
)
}
This is the same content, but interactive.
if (nrow(reducedTerms) > 2) {
revigo_heatmap(simMatrix, reducedTerms)
}
Warning: Specifying width/height in layout() is now deprecated.
Please specify in ggplotly() or plot_ly()
Revigo treemap plot. Terms grouped/colored based on parent. Space is proportional to statistical significance of the GO term (-log10(pvalue)).
if (nrow(reducedTerms) > 2) {
treemapPlot(reducedTerms)
}